Data Backfill

The goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:

Load Libraries

Code
import eia_api as api
import eia_data 
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as go
Code
raw_json = open("./settings/settings.json")
meta_json = json.load(raw_json)
series = pd.DataFrame(meta_json["series"])
api_path = meta_json["api_path"]
Code
facets_template = {
  "parent" : None,
  "subba" : None
}

start = datetime.datetime(meta_json["start"]["year"],
 meta_json["start"]["month"], 
 meta_json["start"]["day"], 
 meta_json["start"]["hour"])

end = datetime.datetime(meta_json["end"]["year"],
 meta_json["end"]["month"], 
 meta_json["end"]["day"], 
 meta_json["end"]["hour"])


offset = 2250

eia_api_key = os.getenv('EIA_API_KEY')

meta_path = meta_json["meta_path"]
data_path = meta_json["data_path"]
series_mapping_path = meta_json["series_mapping_path"]
Code
metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])
dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2019-01-01T00
2024-10-26T07
Code
for i in series.index:
    facets = facets_template
    facets["parent"] = series.at[i, "parent_id"]
    facets["subba"] = series.at[i, "subba_id"]
    print(facets)
    temp = api.eia_backfill(api_key = eia_api_key, 
          api_path = api_path+ "data", 
          facets = facets, 
          start = start,
          end = end,
          offset = offset) 
    ts_obj = pd.DataFrame(np.arange(start = start, stop = end + datetime.timedelta(hours = 1), step = datetime.timedelta(hours = 1)).astype(datetime.datetime), columns=["index"])
    ts_obj  = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how="left")
    ts_obj.drop("period", axis = 1, inplace= True)
    ts_obj = ts_obj.rename(columns= {"index": "period"})
    missing_index = ts_obj[ts_obj["value"].isnull()].index.tolist()
    if len(missing_index) > 0:
      ts_obj.loc[missing_index,"subba"] = facets["subba"]   
    meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
    meta_temp["index"] = 1
    meta_df = pd.DataFrame([meta_temp]) 
    if i == series.index.start:
      data = ts_obj
      meta = meta_df
    else:
      data = data._append(ts_obj)
      meta = meta._append(meta_df)

data.reset_index(drop=True, inplace=True)
meta.reset_index(drop=True, inplace=True)
{'parent': 'NYIS', 'subba': 'ZONA'}
{'parent': 'NYIS', 'subba': 'ZONB'}
{'parent': 'NYIS', 'subba': 'ZONC'}
{'parent': 'NYIS', 'subba': 'ZOND'}
{'parent': 'NYIS', 'subba': 'ZONE'}
{'parent': 'NYIS', 'subba': 'ZONF'}
{'parent': 'NYIS', 'subba': 'ZONG'}
{'parent': 'NYIS', 'subba': 'ZONH'}
{'parent': 'NYIS', 'subba': 'ZONI'}
{'parent': 'NYIS', 'subba': 'ZONJ'}
{'parent': 'NYIS', 'subba': 'ZONK'}
Code
print(meta)
# The initial pull has some missing values

print("Missing Values:" , data["value"].isna().sum())

data.head()
    index parent subba                             time      start  \
0       1   NYIS  ZONA 2024-10-26 11:23:03.092152+00:00 2022-01-01   
1       1   NYIS  ZONB 2024-10-26 11:23:13.342822+00:00 2022-01-01   
2       1   NYIS  ZONC 2024-10-26 11:23:27.939967+00:00 2022-01-01   
3       1   NYIS  ZOND 2024-10-26 11:23:41.014740+00:00 2022-01-01   
4       1   NYIS  ZONE 2024-10-26 11:23:54.614723+00:00 2022-01-01   
5       1   NYIS  ZONF 2024-10-26 11:24:07.774465+00:00 2022-01-01   
6       1   NYIS  ZONG 2024-10-26 11:24:20.903398+00:00 2022-01-01   
7       1   NYIS  ZONH 2024-10-26 11:24:33.983207+00:00 2022-01-01   
8       1   NYIS  ZONI 2024-10-26 11:24:47.645812+00:00 2022-01-01   
9       1   NYIS  ZONJ 2024-10-26 11:25:02.294246+00:00 2022-01-01   
10      1   NYIS  ZONK 2024-10-26 11:25:15.390137+00:00 2022-01-01   

                   end  start_act             end_act  start_match  end_match  \
0  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
1  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
2  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
3  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
4  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
5  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
6  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
7  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
8  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
9  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
10 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   

    n_obs  na  imputed      type  update  success                     comments  
0   24098   1        0  backfill   False    False  Missing values were found;   
1   24098   1        0  backfill   False    False  Missing values were found;   
2   24098   1        0  backfill   False    False  Missing values were found;   
3   24098   1        0  backfill   False    False  Missing values were found;   
4   24098   1        0  backfill   False    False  Missing values were found;   
5   24098   1        0  backfill   False    False  Missing values were found;   
6   24098   1        0  backfill   False    False  Missing values were found;   
7   24098   1        0  backfill   False    False  Missing values were found;   
8   24098   1        0  backfill   False    False  Missing values were found;   
9   24098   1        0  backfill   False    False  Missing values were found;   
10  24098   1        0  backfill   False    False  Missing values were found;   
Missing Values: 11
period subba subba-name parent parent-name value value-units
0 2022-01-01 00:00:00 ZONA West - NYIS NYIS New York Independent System Operator 1707.0 megawatthours
1 2022-01-01 01:00:00 ZONA West - NYIS NYIS New York Independent System Operator 1673.0 megawatthours
2 2022-01-01 02:00:00 ZONA West - NYIS NYIS New York Independent System Operator 1644.0 megawatthours
3 2022-01-01 03:00:00 ZONA West - NYIS NYIS New York Independent System Operator 1605.0 megawatthours
4 2022-01-01 04:00:00 ZONA West - NYIS NYIS New York Independent System Operator 1550.0 megawatthours

Impute Missing Values

Code
def impute_missing(input, var, index):
    class imputed_values:
        def __init__(self, data, missing_index, num_imputed):
            self.data = data
            self.missing_index = missing_index
            self.num_imputed = num_imputed

    input["impute"] = np.NaN
    input = input.sort_values(by = [index])
    missing_index = input[input[var].isnull()].index.tolist()
    non_missing_index = input.index.difference(missing_index).tolist()
    num_imputed = 0
    for i in missing_index:
        if i > 336:
            input.loc[i ,"impute"] = (input.loc[i - 336 ,var] + input.loc[i - 168 ,var] + input.loc[i - 24 ,var]) / 3
            num_imputed = num_imputed + 1
        elif i > 168:
            input.loc[i ,"impute"] = (input.loc[i - 168 ,var] + input.loc[i - 24 ,var]) / 2
            num_imputed = num_imputed + 1
        elif i > 24:
            input.loc[i ,"impute"] = input.loc[i - 24 ,var]
            num_imputed = num_imputed + 1
        else:
            print("There are not enough observations to impute observation:", i)
        input["y"] = np.NaN

    input.loc[missing_index, "y"] = input.loc[missing_index, "impute"]
    input.loc[non_missing_index, "y"] = input.loc[non_missing_index, var]
    output = imputed_values(data = input, missing_index = missing_index, num_imputed = num_imputed)
    return output
Code
def impute_series(series, meta):
    class imputed_series:
        def __init__(self, data, metadata):
            self.data = data
            self.metadata = metadata
    new_df = None
    for index, row in meta.iterrows():
        s = row["subba"]
        temp = None
        temp = series[series["subba"] == s]
        if row["na"] > 0:
            print("Series", s, "has", row["na"], "missing values")
            imputed = impute_missing(input = temp, var = "value", index = "period")
            if imputed.num_imputed > 0:
                temp = imputed.data
                meta.loc[index, "imputed"] = imputed.num_imputed
                meta.loc[index, "comments"] = meta.loc[index, "comments"] + " Missing values were imputed"
            else:
                temp["impute"] = np.NaN
                temp["y"] = temp["value"]
        else:
            temp["impute"] = np.NaN
            temp["y"] = temp["value"]
        if all([meta.loc[index, "na"] == meta.loc[index, "imputed"],meta.loc[index, "start_match"],meta.loc[index, "end_match"]]):
            meta.loc[index, "success"] = True
            meta.loc[index, "update"] = True
        if meta.loc[index, "success"]:
            if new_df is None:
                new_df = temp
            else:
                new_df = pd.concat([new_df, temp])
    if new_df is not None:
        output = imputed_series(data = new_df, metadata = meta)
    else: 
        output = None
    return output
Code
ts = impute_series(series= data, meta = meta)
Series ZONA has 1 missing values
Series ZONB has 1 missing values
Series ZONC has 1 missing values
Series ZOND has 1 missing values
Series ZONE has 1 missing values
Series ZONF has 1 missing values
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Series ZONG has 1 missing values
Series ZONH has 1 missing values
Series ZONI has 1 missing values
Series ZONJ has 1 missing values
Series ZONK has 1 missing values
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Plot the Series

We will use Plotly to visualize the series:

Code
fig = go.Figure()

for i in ts.data["subba"].unique():
    d = None
    d = ts.data[ts.data["subba"] == i]
    fig.add_trace(go.Scatter(x=d["period"], 
    y=d["value"], 
    name = i,
    # line = dict(color = "blue"),
    mode='lines'))
    


fig.add_trace(go.Scatter(
    x=ts.data["period"], 
    y=ts.data["impute"],
    mode='markers',  
    name = "Imputed",
    marker=dict(size=4, symbol='square', color = "red")
    )
    )

fig.update_layout(title = "New York Independent System Operator -  Demand for Electricity")
fig

Save the Data and Metadata

Code
if all(meta["success"]) and all(meta["update"]):
    print("Save the data into CSV file")
    df = ts.data[["period", "subba", "impute", "y"]]
    d = eia_data.append_data(data_path = data_path, new_data = df, init = True, save = True)
    series_meta = ts.data[["subba", "subba-name","parent-name", "value-units"]].drop_duplicates().dropna().sort_values(by = ["subba"])
    series_meta.reset_index(drop=True, inplace=True)
    series_meta.to_csv(series_mapping_path, index = False)
print("Save the meatadata into CSV files")
m = eia_data.append_metadata(meta_path = meta_path, meta = meta, save = True, init = True)
print(m)
Save the data into CSV file
Initial data pull
Save the data to CSV file
Save the meatadata into CSV files
    index parent subba                             time      start  \
0       1   NYIS  ZONA 2024-10-26 11:23:03.092152+00:00 2022-01-01   
1       1   NYIS  ZONB 2024-10-26 11:23:13.342822+00:00 2022-01-01   
2       1   NYIS  ZONC 2024-10-26 11:23:27.939967+00:00 2022-01-01   
3       1   NYIS  ZOND 2024-10-26 11:23:41.014740+00:00 2022-01-01   
4       1   NYIS  ZONE 2024-10-26 11:23:54.614723+00:00 2022-01-01   
5       1   NYIS  ZONF 2024-10-26 11:24:07.774465+00:00 2022-01-01   
6       1   NYIS  ZONG 2024-10-26 11:24:20.903398+00:00 2022-01-01   
7       1   NYIS  ZONH 2024-10-26 11:24:33.983207+00:00 2022-01-01   
8       1   NYIS  ZONI 2024-10-26 11:24:47.645812+00:00 2022-01-01   
9       1   NYIS  ZONJ 2024-10-26 11:25:02.294246+00:00 2022-01-01   
10      1   NYIS  ZONK 2024-10-26 11:25:15.390137+00:00 2022-01-01   

                   end  start_act             end_act  start_match  end_match  \
0  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
1  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
2  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
3  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
4  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
5  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
6  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
7  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
8  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
9  2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   
10 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00         True       True   

    n_obs  na  imputed      type  update  success  \
0   24098   1        1  backfill    True     True   
1   24098   1        1  backfill    True     True   
2   24098   1        1  backfill    True     True   
3   24098   1        1  backfill    True     True   
4   24098   1        1  backfill    True     True   
5   24098   1        1  backfill    True     True   
6   24098   1        1  backfill    True     True   
7   24098   1        1  backfill    True     True   
8   24098   1        1  backfill    True     True   
9   24098   1        1  backfill    True     True   
10  24098   1        1  backfill    True     True   

                                             comments  
0   Missing values were found;  Missing values wer...  
1   Missing values were found;  Missing values wer...  
2   Missing values were found;  Missing values wer...  
3   Missing values were found;  Missing values wer...  
4   Missing values were found;  Missing values wer...  
5   Missing values were found;  Missing values wer...  
6   Missing values were found;  Missing values wer...  
7   Missing values were found;  Missing values wer...  
8   Missing values were found;  Missing values wer...  
9   Missing values were found;  Missing values wer...  
10  Missing values were found;  Missing values wer...